We will analyse the episode ratings data for the TV show *The Office*.
# Download the TidyTuesday (2020-03-17) ratings file and keep it as a tibble.
office_ratings <- readr::read_csv(file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv")
# Print the tibble to inspect its columns and first rows.
office_ratings
## # A tibble: 188 x 6
## season episode title imdb_rating total_votes air_date
## <dbl> <dbl> <chr> <dbl> <dbl> <date>
## 1 1 1 Pilot 7.6 3706 2005-03-24
## 2 1 2 Diversity Day 8.3 3566 2005-03-29
## 3 1 3 Health Care 7.9 2983 2005-04-05
## 4 1 4 The Alliance 8.1 2886 2005-04-12
## 5 1 5 Basketball 8.4 3179 2005-04-19
## 6 1 6 Hot Girl 7.8 2852 2005-04-26
## 7 2 1 The Dundies 8.7 3213 2005-09-20
## 8 2 2 Sexual Harassment 8.2 2736 2005-09-27
## 9 2 3 Office Olympics 8.4 2742 2005-10-04
## 10 2 4 The Fire 8.4 2713 2005-10-11
## # … with 178 more rows
We are going to explore the ratings from IMDB.
library(ggplot2)
library(dplyr)
# Box plot of the IMDB rating distribution for each season.
# `season` is converted to a factor so it is drawn on a discrete axis, with
# one box (and one colour) per season.
office_ratings %>%
  mutate(season = as.factor(season)) %>%
  ggplot(aes(x = season, y = imdb_rating, group = season, colour = season)) +
  geom_boxplot() +
  labs(title = "Boxplots of IMDB Ratings per Season", x = "Season", y = "IMDB Rating") +
  theme_light() +
  # Discrete levels sit at integer positions, so with seasons 1-9 an intercept
  # of 7.5 draws the dashed line between the boxes for seasons 7 and 8.
  geom_vline(xintercept = 7.5, linetype = "dashed") +
  # Label placed to the right of the dashed line, below the boxes.
  annotate("text", x = 8.5, y = 6, label = "Michael Scott\n Leaves")
library(knitr)
# Per-season summary of episode ratings: mean, highest and lowest IMDB rating.
# The assignment target is written first (`<-`) so the name of the result is
# visible at the start of the pipeline rather than hidden at its end.
season_summary <- office_ratings %>%
  group_by(season) %>%
  summarise(
    mean_rating = mean(imdb_rating),
    highest_rating = max(imdb_rating),
    lowest_rating = min(imdb_rating)
  )
# Render the summary as a table in the knitted document.
kable(season_summary)
| season | mean_rating | highest_rating | lowest_rating |
|---|---|---|---|
| 1 | 8.016667 | 8.4 | 7.6 |
| 2 | 8.436364 | 9.3 | 7.9 |
| 3 | 8.573913 | 9.3 | 8.0 |
| 4 | 8.600000 | 9.3 | 7.9 |
| 5 | 8.492308 | 9.6 | 8.1 |
| 6 | 8.219231 | 9.3 | 6.8 |
| 7 | 8.316667 | 9.7 | 7.5 |
| 8 | 7.666667 | 8.2 | 6.7 |
| 9 | 7.956522 | 9.7 | 7.1 |
# Print the ratings tibble again as a reminder of the available columns.
office_ratings
## # A tibble: 188 x 6
## season episode title imdb_rating total_votes air_date
## <dbl> <dbl> <chr> <dbl> <dbl> <date>
## 1 1 1 Pilot 7.6 3706 2005-03-24
## 2 1 2 Diversity Day 8.3 3566 2005-03-29
## 3 1 3 Health Care 7.9 2983 2005-04-05
## 4 1 4 The Alliance 8.1 2886 2005-04-12
## 5 1 5 Basketball 8.4 3179 2005-04-19
## 6 1 6 Hot Girl 7.8 2852 2005-04-26
## 7 2 1 The Dundies 8.7 3213 2005-09-20
## 8 2 2 Sexual Harassment 8.2 2736 2005-09-27
## 9 2 3 Office Olympics 8.4 2742 2005-10-04
## 10 2 4 The Fire 8.4 2713 2005-10-11
## # … with 178 more rows
# Scatter plot of total votes against IMDB rating, one point per episode,
# coloured by season (converted to a factor so the colour scale is discrete).
ggplot(
  data = mutate(office_ratings, season = as.factor(season)),
  mapping = aes(x = imdb_rating, y = total_votes, colour = season)
) +
  geom_point() +
  ggtitle("Votes vs Rating") +
  xlab("IMDB Rating") +
  ylab("Total Votes")
library(plotly)
# Interactive version of the votes-vs-rating scatter plot.
# The extra `text` aesthetic builds an HTML hover label per episode
# (season/episode code, title, rating, vote count and season).
# The assignment target is written first (`<-`) so the plot object's name is
# visible at the start of the pipeline rather than hidden at its end.
votes_rating_plot <- office_ratings %>%
  mutate(season = as.factor(season)) %>%
  ggplot(aes(
    x = imdb_rating, y = total_votes, colour = season,
    text = paste0(
      "S", season, ".E", episode, " ", title,
      "<br>IMDB: ", imdb_rating,
      "<br>No. Votes: ", total_votes,
      "<br>Season: ", season
    )
  )) +
  geom_point() +
  labs(title = "Votes vs Rating", x = "IMDB Rating", y = "Total Votes")
# tooltip = "text" makes the hover label show only the custom text aesthetic.
ggplotly(votes_rating_plot, tooltip = "text")